C. Anderson provokingly in the magazine Wired:

“The End of Theory: the data deluge makes the scientific method obsolete”

Why model?

J. Epstein (2008, JASSS): ‘Why model?’

A model is a simplified representation/abstraction of a target system, which implements some theoretical propositions about the logical linkages between objects of interest.

In this tutorial, two kinds of models are presented:

More generally, we want to show how the two shed complementary light on spatial problems and how they interact with new massive data sources.

Case study: urban segregation with scraped AirBnB data

The data

Let’s bind together the data for all three cities

# List the per-city listing files (alphabetical order on disk:
# Montreal, Toronto, Vancouver) and read each one.
f <- list.files('insideairbnb/')

mont <- read.csv(file.path('insideairbnb', f[1]))
toro <- read.csv(file.path('insideairbnb', f[2]))
vanc <- read.csv(file.path('insideairbnb', f[3]))

# City labels actually present in each file (kept for later reference).
toronto <- unique(toro$city)
vancouver <- unique(vanc$city)
montreal <- unique(mont$city)

# Stack the three cities into a single data frame.
df <- rbind(mont, toro, vanc)

Cleaning here means choosing how to bring the data at hand in line with the purpose of the analysis.

First, if we want to take airbnb listings as proxies for residents, we need to identify and remove commercial lettings, as well as listings in neighbourhoods which are different from the host’s neighbourhood, as well as multiproperties.

# Collapse the detailed property types into three coarse groups
# ('home' / 'hotel' / 'other') so hotel-like and miscellaneous listings
# can be filtered out later. Built in one step instead of creating a
# placeholder integer column and overwriting it (also avoids the
# 1:length(l) anti-pattern).
l <- levels(df$property_type)
lookup <- data.frame(
  'type' = factor(l),
  'property_group' = c(
# [1] "Aparthotel"             "Apartment"              "Bed and breakfast"      "Boat"                   "Boutique hotel"         "Bungalow"               "Cabin"                 
'hotel', 'home', 'hotel', 'other', 'hotel', 'home', 'other',
  #  [8] "Camper/RV"              "Campsite"               "Casa particular (Cuba)" "Cave"                   "Chalet"                 "Condominium"            "Cottage"               
'other', 'other', 'home', 'other', 'home', 'home', 'home',
# [15] "Farm stay"              "Guest suite"            "Guesthouse"             "Hostel"                 "Hotel"                  "House"                  "Houseboat"             
'home', 'home', 'hotel', 'hotel', 'hotel', 'home', 'home',
# [22] "Hut"                    "Loft"                   "Nature lodge"           "Other"                  "Serviced apartment"     "Tent"                   "Timeshare"             
'other', 'home', 'other', 'other', 'hotel', 'other', 'other',
# [29] "Tiny house"             "Townhouse"              "Villa"                  "Barn"                   "Castle"                 "Dorm"                   "Earth house"           
'home', 'home', 'home', 'home', 'home', 'hotel', 'other',
# [36] "In-law"                 "Parking Space"          "Treehouse"              "Resort"            
'home', 'other', 'other', 'hotel'
  ),
  # Keep property_group as character, matching the original column-assignment
  # behaviour (which never factor-converted it).
  stringsAsFactors = FALSE
)

# Attach the lookup columns (type, property_group) to every listing by
# matching on property type.
grouped <- lookup[match(df$property_type, lookup$type), ]
df <- data.frame(df, grouped)

# Keep only 'home' listings located in the host's own neighbourhood.
# (A stricter variant also dropping shared rooms was considered:)
 # dfh = subset(df, property_group == 'home' & as.character(df$host_neighbourhood) == as.character(df$neighbourhood) & df$room_type != "Shared room")
dfh <- subset(df,
              property_group == 'home' &
                as.character(host_neighbourhood) == as.character(neighbourhood))
dfh$property_group <- NULL
# One listing per host: drop multiproperty duplicates.
dfhu <- dfh[!duplicated(dfh$host_id), ]
dim(df)
## [1] 43211    98
dim(dfhu)
## [1] 24404    97

Then we want to keep only current listings, e.g. those whose last review is less than two years old.

# Year of the most recent review ("YYYY-MM-DD" -> numeric YYYY).
dfhu$year <- as.numeric(substr(dfhu$last_review, 1, 4))

 # pal <- colorFactor(
 #    palette =  brewer.pal(n=10, 'Blues'),
 #    domain = dfhu$year
 #  )

# leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
#   addCircleMarkers(
#     data = dfhu,
#         radius = ~ sqrt(0.07 * numPrice),
#     lat = ~ latitude,
#     color = ~pal(year),
#     stroke = FALSE,
#     fillOpacity = 0.5,
#     layerId = ~ id,
#     lng = ~ longitude
#   ) %>%  addLegend(pal = pal, position = 'topleft', values = dfhu$year)
# 

# Keep listings whose last review is recent (2017 or later).
dfhun <- subset(dfhu, year >= 2017)
nrow(df)
## [1] 43211
nrow(dfhun)
## [1] 16905

Then, we want to convert the price variable into a usable (numeric) form and normalise it by a measure of size — the number of rooms, since the square-footage field is mostly left empty by hosts. This also requires removing the small number of shared rooms.

# Strip the currency symbol AND the thousands separator: the original
# pattern "[$]" left commas in place, so every price of $1,000 or more
# became NA ("NAs introduced by coercion" in the original output).
dfhun$numPrice <- as.numeric(gsub("[$,]", '', dfhun$price))
summary(dfhun$room_type)
## Entire home/apt    Private room     Shared room 
##           12163            4592             150
final <- subset(dfhun, room_type != "Shared room")

# Guard against zero-bedroom (studio) listings before dividing.
final$rooms <- ifelse(final$bedrooms == 0, 1, final$bedrooms)
# A private room already IS a single room, so its price needs no
# normalisation; entire homes are divided by bedroom count.
# (The original compared room_type == T, which is FALSE for every factor
# level, so private-room prices were wrongly divided as well.)
final$priceperroom <- as.numeric(ifelse(final$room_type == "Private room",
                                        final$numPrice,
                                        final$numPrice / final$rooms))

Let’s retrieve census shape files and data from the census API

and map!

  # Decile palette for price per room. 'Blues' only defines 9 colours, so
  # interpolate to 10 with colorRampPalette instead of asking brewer.pal
  # for n = 10 directly (which triggered the repeated "n too large"
  # warnings in the original output).
  pal <- colorQuantile(
     palette = colorRampPalette(RColorBrewer::brewer.pal(9, 'Blues'))(10),
     domain = city_data$priceperroom,
     n = 10
   )

  # Basemap + two boundary layers, then one circle per listing,
  # sized by room count and coloured by price-per-room decile.
  map <- leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
      addPolygons(
      data = csd.csd.geo,
      color = 'black',
      fill = F,
      weight = 0.7,
      opacity = 0.9
    ) %>% addPolygons(
      data = csd.geo,
      color = 'grey',
      fill = F,
       weight = 0.4
    ) %>%
  addCircleMarkers(
    data = city_data,
        radius = ~ sqrt(4 * rooms),
    lat = ~ latitude,
    fillColor = ~ pal(priceperroom),
    color = 'black',
    stroke = T,
    fillOpacity = 0.5,
   weight = 0.1,
    layerId = ~ id,
    lng = ~ longitude
  ) %>% 
  addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)
 map

and map!

  # Decile palettes; both brewer palettes max out at 9 colours, so
  # interpolate to 10 via colorRampPalette to avoid the "n too large"
  # warnings the original produced.
  pal <- colorQuantile(
     palette = colorRampPalette(RColorBrewer::brewer.pal(9, 'Blues'))(10),
     domain = city_data$priceperroom,
     n = 10
   )
  # Red scale on negated tract entropy Ei (reddest = lowest entropy).
      pal2 <- colorQuantile(
     palette = colorRampPalette(RColorBrewer::brewer.pal(9, 'Reds'))(10),
     domain = -tractTable$Ei,
     n = 10)

   map <- leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
      addPolygons(
      data = csd.csd.geo,
      color = 'black',
      fill = F,
      weight = 0.7,
      opacity = 0.9
    ) %>% addPolygons(
      data = tractTable,
      color =  ~ pal2(-Ei),
      # addPolygons' `fill` argument is logical (draw a fill or not); the
      # fill colour belongs in `fillColor`. The original passed the
      # palette formula to `fill`, which is invalid.
      fillColor = ~ pal2(-Ei),
       weight = 0.4
    ) %>%
  #addCircleMarkers(
  #   data = city_data,
  #       radius = ~ sqrt(4 * rooms),
  #   lat = ~ latitude,
  #   fillColor = ~ pal(priceperroom),
  #   color = 'black',
  #   stroke = T,
  #   fillOpacity = 0.5,
  #  weight = 0.1,
  #   layerId = ~ id,
  #   lng = ~ longitude
  # ) %>% 
# addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)%>% 
  addLegend(pal = pal2, position = 'topleft', values = -tractTable$Ei)
   map

Let’s use information theory to qualify the diversity and segregation of a given city (cf. John Iceland et al. on multigroup entropy: https://www.census.gov/hhes/www/housing/resseg/multigroup_entropy.pdf). The measures implemented are described below and tested on Canadian metropolises using the package cancensus (example by @dshkol: https://github.com/dshkol/scratchpad/blob/master/content/post/2018-05-10-diversity-and-segregation-i.Rmd).